import pandas as pd
import re, datetime

# Load the raw history export; the '时间' column is parsed as datetimes.
data = pd.read_csv(
    'personal_history/最后的数据4.csv',
    parse_dates=['时间'],
)
print('原始数据总量:', data.shape)

# Replace every missing cell with 0 so the cleaning steps see no NaN.
data = data.fillna(value=0)

def data_clean(str):
    """Convert a raw counter cell into an integer.

    Args:
        str: The raw cell value. Either a number, or text such as
            ``'1234'`` or ``'1.2万'`` (``万`` means "times 10 000").
            The parameter name shadows the builtin ``str``; it is kept
            unchanged so existing callers keep working.

    Returns:
        int: The numeric value. The captions ``'收藏'``, ``'分享'``,
        ``'投币'`` and ``'点赞'`` as well as empty text yield ``0``.

    Raises:
        ValueError: If the text is neither a caption nor a number.
    """
    raw = str
    # The builtin ``str`` is shadowed by the parameter, so the text type is
    # recovered from a literal. isinstance also accepts str subclasses.
    if not isinstance(raw, type('')):
        return int(raw)

    text = raw.strip()
    # A bare caption appears in place of a number -- presumably the button
    # label shown when the counter is zero (TODO confirm against the scraper).
    # An empty cell carries no count either.
    if text in ('', '收藏', '分享', '投币', '点赞'):
        return 0

    if '万' in text:
        # round() instead of int(): the float product can land a hair below
        # the exact integer, and truncation would then lose a whole unit.
        return round(float(text.split('万')[0]) * 10000)
    return int(text)

def label_clean(str):
    """Strip list punctuation from a stringified label list.

    Removes every ``[``, ``]``, ``'`` and ``,`` character from the text;
    all other characters (including spaces) are kept as they are.

    Args:
        str: The raw label cell, e.g. ``"['a', 'b']"``. The parameter name
            shadows the builtin ``str``; it is kept unchanged so existing
            callers keep working.

    Returns:
        The cleaned text, or ``''`` when the cell is not text at all
        (for example a ``0`` left behind by ``fillna(0)``).
    """
    raw = str
    # A non-text cell holds no labels; returning '' avoids the TypeError
    # that re.sub raises for non-string input.
    if not isinstance(raw, type('')):
        return ''
    # Raw-string character class: same characters the original removed
    # (brackets and quotes via re.sub, commas via split/join).
    return re.sub(r"[\[\]',]", '', raw)

# Turn every counter column into plain integers.
for _col in ('收藏数', '转发数', '投币数', '点赞数', '作者粉丝'):
    data[_col] = data[_col].apply(data_clean)

# Strip the list punctuation from the label column.
data['标签'] = data['标签'].apply(label_clean)

# Midnight of the current day as a Timestamp. Comparing a Timestamp directly
# with a datetime.date object is rejected by recent pandas versions, so the
# date is converted once up front (this also hoists it out of the loop).
today_start = pd.Timestamp(datetime.date.today())

# Index of the first row that follows a "time goes up" step, i.e. the first
# row whose predecessor has an earlier timestamp. Stays 0 if no step exists.
# NOTE(review): presumably the rows are newest-first and the step marks where
# yesterday's entries begin -- confirm against the scraper output.
num_today = 0
for i in range(len(data['时间']) - 1):
    if data['时间'][i] < data['时间'][i + 1]:
        num_today = i + 1
        break

# Index of the last row (the final row is not inspected) whose timestamp is
# at or after today's midnight. Stays 0 if there is none.
# NOTE(review): skipping the final row mirrors the original loop bounds;
# verify whether that is intended.
num_yesterday = 0
for i in range(len(data['时间']) - 1):
    if data['时间'][i] >= today_start:
        num_yesterday = i

# Move the rows num_today..num_yesterday (inclusive, label-based) back by one
# day. NOTE(review): when no step was found num_today is 0, so the shift
# starts at the first row -- confirm this is the desired behaviour.
data.loc[num_today:num_yesterday, '时间'] = (
    data.loc[num_today:num_yesterday, '时间'] - datetime.timedelta(days=1)
)

# Keep only the rows whose cleaned label text is non-empty.
has_label = data['标签'].apply(lambda tags: len(tags) > 0)
data_full = data[has_label]
print('清理过后的数据总量：', data_full.shape)

# Write the result as UTF-8 with BOM, without the index column.
data_full.to_csv(
    'personal_history/清理后的个人历史数据（非直播）.csv',
    encoding='utf_8_sig',
    index=False,
)